# Training-run settings written as flat `key = value` assignments.
# NOTE(review): the per-key notes below are inferred from the key names and
# values only; confirm each one against the code that consumes this file.
# Presumably the sequence length in tokens -- TODO confirm.
seqlength = 4096
# Presumably the batch size per step -- TODO confirm (global vs. per-device).
batchsize = 1
# Presumably the number of gradient-accumulation steps -- TODO confirm.
accumulate_steps = 1024
# Presumably the total number of tokens to train on -- TODO confirm.
train_tokens = 100000000
# Negative value; looks like an "unknown / not set" placeholder for the
# hardware's theoretical FLOPS -- TODO confirm how the consumer treats -1.
theoryflops = -1
epochs = 1
# Feature switches; presumably flash attention and activation recomputation
# -- TODO confirm.
flashattn = True
recompute = True
# Parallelism degrees. 4 * 8 = 32; presumably the number of devices this
# layout expects (before any data parallelism) -- TODO confirm.
tensor_parallel = 4
pipeline_parallel = 8
